# install.packages("readxl")
# install.packages("tidyverse")
# install.packages("plotly")
# install.packages("rmarkdown")


library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(rmarkdown)
## Warning: package 'rmarkdown' was built under R version 4.3.3
# Replace 'your_file.xlsx' with the path to your file
file_path <- "~/Downloads/DownloadTable_StatePayerED_8-29-24.xlsx"

# Read the "data" sheet
data <- read_excel(file_path, sheet = "Data")

# Count the number of cells in the dataframe that contain exactly 0. 
zero_count <- sum(data == 0, na.rm = TRUE)

# Print the result
print(paste("Number of cells with just 0:", zero_count))
## [1] "Number of cells with just 0: 0"
#Checking if any states have Age == "Adult and Pediatric" data but missing data for either Age == "Adult" or Age == "Pediatric": 
# Filter relevant data
age_data <- data %>%
  filter(Age %in% c("Pediatric", "Adult", "Adult and Pediatric"))

# Summarize presence of Age categories per State
age_summary <- age_data %>%
  group_by(State) %>%
  summarize(
    has_pediatric = any(Age == "Pediatric"),
    has_adult = any(Age == "Adult"),
    has_adult_and_pediatric = any(Age == "Adult and Pediatric"),
    .groups = "drop"
  )

# Identify problematic states
problem_states <- age_summary %>%
  filter(
    has_adult_and_pediatric & (!has_pediatric | !has_adult)
  )

# View problematic states
print(problem_states) #Looks good, I can proceed with calculation below: 
## # A tibble: 0 × 4
## # ℹ 4 variables: State <chr>, has_pediatric <lgl>, has_adult <lgl>,
## #   has_adult_and_pediatric <lgl>
# Filter data based on the conditions
filtered_data <- data %>%
  filter(Age %in% c("Pediatric", "Adult"),
         `Expected Payer` == "All expected payers^",
         `Hospitalization Type` %in% c("ED Visits", "Mental Health/Substance Use ED Visits"))

# Gather quarterly columns into long format for easier manipulation
long_data <- filtered_data %>%
  pivot_longer(cols = starts_with("20"),  # Match all columns starting with year
               names_to = "Quarter",
               values_to = "Count")

# Extract year from quarter
long_data <- long_data %>%
  mutate(Year = sub(" Q[1-4]", "", Quarter))  # Remove quarter to keep the year

# Summarize data by State, Year, and Hospitalization Type
annual_data <- long_data %>%
  group_by(State, Year, `Hospitalization Type`) %>%
  summarize(
    Total_Visits = if (all(is.na(Count))) NA else sum(Count, na.rm = TRUE),
    .groups = "drop"
  )
# Pivot wider to create separate columns for each year and hospitalization type
final_data <- annual_data %>%
  pivot_wider(names_from = c(Year, `Hospitalization Type`),
              values_from = Total_Visits,
              names_glue = "Total {`Hospitalization Type`}_{Year}") 

# Create a list of the columns for Mental Health/Substance Use Visits and ED Visits
mental_health_cols <- grep("Mental Health/Substance Use ED Visits", names(final_data), value = TRUE)
ed_visit_cols <- gsub("Mental Health/Substance Use ED Visits", "ED Visits", mental_health_cols)

# Calculate the proportion of mental health/substance use visits for each year
for (i in seq_along(mental_health_cols)) {
  year <- sub("Total Mental Health/Substance Use ED Visits_", "", mental_health_cols[i])
  ed_col <- paste0("Total ED Visits_", year)
  mh_col <- mental_health_cols[i]
  
  final_data <- final_data %>%
    mutate(!!paste0("Proportion_Mental_Health_", year) := 
             get(mh_col) / get(ed_col))
}
# Extract columns starting with "Proportion_Mental_Health_" followed by year
proportion_MH_cols <- grep("^Proportion_Mental_Health_", names(final_data), value = TRUE)

# Reshape data into a long format
proportion_MH_long_data <- final_data %>%
  select(State, all_of(proportion_MH_cols)) %>%
  pivot_longer(cols = -State, 
               names_to = "Year", 
               values_to = "Proportion") %>%
  mutate(Year = as.numeric(gsub("Proportion_Mental_Health_", "", Year)))

# Identify states for minimum and maximum values
summary_stats <- proportion_MH_long_data %>%
  group_by(Year) %>%
  summarise(
    Min = min(Proportion, na.rm = TRUE),
    Max = max(Proportion, na.rm = TRUE),
    Mean = mean(Proportion, na.rm = TRUE),
    Min_State = State[which.min(Proportion)],
    Max_State = State[which.max(Proportion)]
  )

# Add states to the long data for plotting
plot_data <- summary_stats %>%
  pivot_longer(
    cols = c("Min", "Max", "Mean"),
    names_to = "Statistic",
    values_to = "Value"
  ) %>%
  mutate(
    State = case_when(
      Statistic == "Min" ~ Min_State,
      Statistic == "Max" ~ Max_State,
      TRUE ~ NA_character_
    )
  )
# Filter out rows with NA values in the `Value` column
filtered_plot_data <- plot_data %>%
  filter(!is.na(Value))

# Define breaks for the Year and Proportion axes
p1.year_breaks <- seq(min(filtered_plot_data$Year), max(filtered_plot_data$Year), by = 1)
p1.proportion_breaks <- pretty(filtered_plot_data$Value)

# Create the plot using the filtered data
p1 <- ggplot(filtered_plot_data, aes(x = Year, y = Value, color = Statistic, group = Statistic)) +
  geom_line(size = 1) +
  geom_point(aes(
    text = paste0(
      "Year: ", Year,
      "<br>Statistic: ", Statistic,
      "<br>Value: ", round(Value, 2),
      ifelse(!is.na(State), paste0("<br>State: ", State), "")
    )
  )) +
  scale_x_continuous(breaks = p1.year_breaks) +
  scale_y_continuous(breaks = p1.proportion_breaks) +
  labs(
    title = "Proportion of Emergency Department Utilization for Mental Health/Substance Use Reasons Across N = 37 States",
    x = "Year",
    y = "Proportion",
    color = "Statistic"
  ) +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning in geom_point(aes(text = paste0("Year: ", Year, "<br>Statistic: ", :
## Ignoring unknown aesthetics: text
# Convert to interactive plotly object
p1.interactive_plot <- ggplotly(p1, tooltip = "text", width = 1000, height = 600) %>%
  layout(
    title = list(
      font = list(size = 15)  # Adjust title font size if needed
    ),
    xaxis = list(
      tickangle = 45  # Tilt x-axis labels by 45 degrees
    )
  )


# Show the interactive plot
p1.interactive_plot